﻿/*  
cd /projects/data_commons/lbd_code/
qsas lbd_2_clean.sas 8 &

Prepared by Feng Lin.

This script merges LBD data with author-created industry classification and MSA2010, and conducts some manual fixes.
*/

libname llbd "/projects/data_commons/lbd/";

/*
---------------------------------------------------------
Load crosswalk and macro
---------------------------------------------------------
*/

proc import out=msa datafile="/projects/data_commons/cw/cw_geo_fips_msa.csv";
run;

data msa;
  set msa;
  drop msa_str;
run;

%include "/projects/data_commons/lbd_code/lbd_0_m_lbd_basic.sas" /source2;
%include "/projects/data_commons/lbd_code/lbd_0_ch_ind.sas" /source2;

/*
--------------------------------------------------------------------------------
Read in raw data
*/

data lbd_raw;
  set llbd.lbd_raw_orig;
run;

/*
--------------------------------------------------------------------------------
Run basic cleaning
*/

%m_lbd_basic(pdt_in=lbd_raw,pdt_out=lbd_raw,p_exna=1);

/*
--------------------------------------------------------------------------------
Fix fips and select fk_naics
*/

data lbd_raw;
  set lbd_raw;
  
  /*REDACTED */
  /* Manual fix for some fips codes by Chang-Tai Hsieh. */
  
  if year = . then delete;
  if year = 1976 then delete;
  /* We drop 1976 for quality issues. */
  /* Now including later years. 201222 */
  est = 1;
  
  if year <= 2009 then fk_naics = substr(fk_naics02,1,6)*1;
  if year >= 2010 & year <= 2014 then fk_naics = substr(fk_naics07,1,6)*1;
  if year >= 2015 then fk_naics = substr(fk_naics12,1,6)*1;
  /* 
  We use the earliest available vintage of fk_naics for each year to reduce the number of splits. 
  Note that the change between fk_naics02 and fk_naics07 are minor, but between these and fk_naics12 are not insignificant,
  which means we cannot simply use the crosswalk created for fk_naics02 and fk_naics07 in years we only have fk_naics12.
  */
  
  /* drop fk_naics02 fk_naics07 fk_naics12; */ /* Keep these variables for convenience */
run;

/*
--------------------------------------------------------------------------------
Merge with MSA2010
*/

proc sort data=msa; by fips;
proc sort data=lbd_raw; by fips;

data lbd_raw;
  merge lbd_raw(in=x) msa; 
  by fips;
  if x = 1;
run;

/*
--------------------------------------------------------------------------------
Merge with industry code

/* REDACTED
Comment denoting how ch_inds are mapped has been redacted */


/* 
----------------------------------------
fk_naics02 and fk_naics07 
*/

data lbd01;
  set lbd_raw;
  if year <= 2014;
run;

%m_merge_ind(data_raw=lbd01, fk_ver= );

/*
----------------------------------------
fk_naics12
*/

data lbd02;
  set lbd_raw;
  if year >= 2015;
run;

%m_merge_ind(data_raw=lbd02, fk_ver=12);

data lbd;
  set lbd01 lbd02;
run;

/* 
--------------------------------------------------------------------------------
Changes codes for industries not defined with fk_naics at all

(There should have been mapped from naics or sic.)
*/

data lbd;
  set lbd;
  /* REDACTED
  ch_ind code changes have been redacted */
  
run;

/* REDACTED
Discussion and code related to a particular ch_ind change has been redacted */

proc sort data=lbd;
  by lbdid descending year;
run;

data lbd;
  set lbd;
  by lbdid;
  fk_naics_f1 = lag1(fk_naics);
  ch_ind_f1 = lag1(ch_ind);
  ch_ind_by_f1 = lag1(ch_ind_by);
  if first.lbdid then fk_naics_f1 = .;
  if first.lbdid then ch_ind_f1 = .;
  if first.lbdid then ch_ind_by_f1 = .;
run;

data lbd;
  set lbd;
  /* REDACTED */
  drop fk_naics_f1 ch_ind_f1 ch_ind_by_f1;
run;

/* REDACTED
Discussion and code related to a particular ch_ind change has been redacted */



proc sort data=lbd;
  by year lbdid;
run;

proc contents data=lbd noprint out=table_contents;
run;

proc print data=table_contents;
run;

data llbd.lbd_clean;
  set lbd;
  if year ~= .;
  if worker ~= .;
run;

/* Drop establishments without year or employment data. (Should have no effect since they are not used.) */


/* End of sas file */
